import os
from collections import Counter
import pandas as pd


class LaborStatistical:
    """Rank authors by how often their URL appears in the crawled articles CSV.

    Attributes (all rebuilt on every call to ``statistical``):
        author_url_list: every usable ``author_url`` value, one entry per CSV row.
        author_url_dict: ``author_url`` -> number of rows, most frequent first.
        author_info: one dict per distinct author, in the same order as
            ``author_url_dict``.
        my_BASE_DIR: directory two levels above this file.
        csv_path: CSV file (crawler output) that ``statistical`` reads.
    """

    # Columns copied from an author's first CSV row into each result dict.
    _AUTHOR_COLUMNS = (
        "author_url",
        "author_lever",
        "author",
        "author_articles",
        "author_watchs",
        "author_fans",
    )

    def __init__(self):
        self.author_url_list = []
        self.author_url_dict = {}
        self.author_info = []
        self.my_BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        # csv_path: the CSV file to read (produced by the crawler).
        self.csv_path = os.path.join(self.my_BASE_DIR, 'static', 'csv_collect', 'deep_articles.csv')

    def statistical(self):
        """Count articles per author and collect each author's details.

        Reads ``self.csv_path`` once, counts how many rows carry each usable
        ``author_url``, and builds one record per distinct author from the
        first row in which that URL appears.

        Returns:
            list of dict: one entry per author, ordered by descending count
            (ties keep first-appearance order). Each dict holds the columns in
            ``_AUTHOR_COLUMNS`` plus ``"count"``.

        Raises:
            KeyError: if the CSV lacks the ``author_url`` column, or lacks one
                of the other author columns while at least one author is found.
            Whatever ``pandas.read_csv`` raises when the file cannot be read.
        """
        df = pd.read_csv(self.csv_path, sep=",", header=0, encoding="utf-8")

        # Start from a clean slate so repeated calls do not double the counts
        # or duplicate the result entries.
        self.author_url_list = []
        self.author_info = []

        # Single pass: keep the usable URLs and remember the index label of the
        # first row each one appears in, so no per-author rescan is needed.
        first_row = {}
        for label, url in df["author_url"].items():
            if self._is_valid_author_url(url):
                self.author_url_list.append(url)
                first_row.setdefault(url, label)

        # most_common() sorts by count, descending; the sort is stable, so
        # equal counts stay in first-appearance order.
        self.author_url_dict = dict(Counter(self.author_url_list).most_common())

        total = len(self.author_url_dict)
        for position, (url, count) in enumerate(self.author_url_dict.items(), start=1):
            print(f"正在分析，第{position}/{total}条数据")
            row = first_row[url]
            record = {column: df.at[row, column] for column in self._AUTHOR_COLUMNS}
            record["count"] = count
            self.author_info.append(record)

        # Dump the collected records for inspection.
        print(self.author_info)

        return self.author_info

    @staticmethod
    def _is_valid_author_url(topic):
        """Return True when *topic* is a usable author URL.

        Missing values (None / NaN / pd.NA), the empty string and the
        ``"NOTFOUND"`` placeholder are rejected.
        """
        # Check for missing values first: pandas yields NaN for empty CSV
        # cells, and NaN compares unequal to every string, so the string
        # checks alone would let it through.
        if pd.isna(topic):
            return False
        return topic != "" and topic != "NOTFOUND"

    def extract_author_url(self, topic):
        """Append *topic* to ``self.author_url_list`` if it is a usable author URL.

        Missing values, the empty string and ``"NOTFOUND"`` are skipped.
        """
        if self._is_valid_author_url(topic):
            self.author_url_list.append(topic)


# Entry point when this file is run as a script.
if __name__ == "__main__":
    # Report any failure on stdout instead of letting a traceback escape.
    try:
        ls = LaborStatistical()
    except Exception as e:
        print("错误:", e)
    else:
        try:
            ls.statistical()
        except Exception as e:
            print("错误:", e)
